- #
- # python code for building a parser from a grammar
- # Copyright Aaron Robert Watters, 1994
- #
-
- # BUGS:
- # A bad grammar that has no derivations for
- # the root nonterminal is detected in CompSLRNFA,
- # which raises NotSLRError (formerly this caused a
- # name error on the variable "GoodStartingPlace").
-
- # this needs to be modified so the RULEGRAM is loaded from a
- # compiled representation if available.
-
- import string
- import kjSet
- import kjParser
- import regex
-
- # import some constants
- from kjParser import \
- TERMFLAG, NOMATCHFLAG, MOVETOFLAG, REDUCEFLAG, TRANSFLAG, KEYFLAG, \
- NONTERMFLAG, EOFFLAG, ENDOFFILETOKEN
-
- PMODULE = kjParser.THISMODULE
-
- # errors raised here
- TokenError = "TokenError" # may happen on autogen with bad grammar
- NotSLRError = "NotSLRError" # may happen for nonSLR grammar
- FlowError = "FlowError" # internal error: unexpected control flow
-
- # set this flag for regression testing at each load
- RUNTESTS = 0
- # set this flag to abort automatic generation on Errors
- ABORTONERROR = 0
-
- # token used to mark null productions
- NULLTOKEN = (None,None)
-
-
- # a derived FSM class, with closure computation methods defined
- # (compilable FSMachine)
- #
- class CFSMachine(kjParser.FSMachine):
-
- def __init__(self, nonterm):
- kjParser.FSMachine.__init__(self, nonterm)
-
- # return the epsilon closure of the FSM as a new FSM
- #
- # DoNullMaps, if set, will map unexpected tokens to
- # the "empty" state (usually creating a really big fsm)
- #
- def Eclosure(self, Epsilon, DoNullMaps=0):
- Closure = CFSMachine( self.root_nonTerminal )
-
- # compute the Epsilon Graph between states
- EGraph = kjSet.NewDG([])
- for State in range(0,self.maxState+1):
- # every state is E-connected to self
- kjSet.AddArc( EGraph, State, State )
- # add possible transition on epsilon (ONLY ONE SUPPORTED!)
- key = (State, Epsilon)
- if self.StateTokenMap.has_key(key):
- keymap = self.StateTokenMap[key]
- if keymap[0][0] != MOVETOFLAG:
- raise TypeError, "unexpected map type in StateTokenMap"
- for (Flag,ToState) in keymap:
- kjSet.AddArc( EGraph, State, ToState )
- #endfor
- # transitively close EGraph
- kjSet.TransClose( EGraph )
-
- # Translate EGraph into a dictionary of lists
- EMap = {}
- for State in range(0,self.maxState+1):
- EMap[State] = kjSet.Neighbors( EGraph, State )
-
- # make the e-closure of each self.state a state of the closure FSM.
- # here closure states are assumed transient -- reset elsewhere.
- # first do the initial state
- Closure.States[ Closure.initial_state ] = \
- [TRANSFLAG, kjSet.NewSet(EMap[self.initial_state]) ]
- # do all other states (save initial and successful final states)
- #for State in range(0,self.maxState+1):
- # if State != self.initial_state \
- # and State != self.successful_final_state:
- # Closure.NewSetState(TRANSFLAG, kjSet.NewSet(EMap[State]) )
- ##endfor
-
- # compute set of all known tokens EXCEPT EPSILON
- Tokens = kjSet.NewSet( [] )
- for (State, Token) in self.StateTokenMap.keys():
- if Token != Epsilon:
- kjSet.addMember(Token, Tokens)
- # transform it into a list
- Tokens = kjSet.get_elts(Tokens)
-
- # for each state of the closure FSM (past final) add transitions
- # and add new states as needed until all states are processed
- # (uses convention that states are allocated sequentially)
- ThisClosureState = 1
- while ThisClosureState <= Closure.maxState:
- MemberStates = kjSet.get_elts(Closure.States[ThisClosureState][1])
- # for each possible Token, compute the union UTrans of all
- # e-closures for all transitions for all member states,
- # on the Token, make UTrans a new state (if needed),
- # and transition ThisClosureState to UTrans on Token
- for Token in Tokens:
- UTrans = kjSet.NewSet( [] )
- for MState in MemberStates:
- # if MState has a transition on Token, include
- # EMap for the destination state
- key = (MState, Token)
- if self.StateTokenMap.has_key(key):
- DStateTup = self.StateTokenMap[key]
- if DStateTup[0][0] != MOVETOFLAG:
- raise TypeError, "unknown map type"
- for (DFlag, DState) in DStateTup:
- for EDState in EMap[DState]:
- kjSet.addMember(EDState, UTrans)
- #endif
- #endfor MState
- # register UTrans as a new state if needed
- UTState = Closure.NewSetState(TRANSFLAG, UTrans)
- # record transition from
- # ThisClosureState to UTState on Token
- if DoNullMaps:
- Closure.SetMap( ThisClosureState, Token, UTState)
- else:
- if not kjSet.Empty(UTrans):
- Closure.SetMap( ThisClosureState, Token, UTState)
- #endfor Token
- ThisClosureState = ThisClosureState +1
- #endwhile
- return Closure
- #enddef Eclosure
-
- # add a set-marked state to self if not present
- # uses self.States[s][1] as the set marking the state s
- #
- # only used by Eclosure above
- #
- def NewSetState(self, kind, InSet):
- # return existing state if one is present that matches the set
- LastState= self.maxState
- # skip state 0 (successful final state)???
- for State in range(1,LastState+1):
- MarkSet = self.States[State][1]
- if kjSet.Same(InSet,MarkSet):
- return State # nonlocal
- #endfor
- # if we did not return above, allocate a new state
- LastState = LastState + 1
- self.States[LastState] = [ kind , InSet ]
- self.maxState = LastState
- return LastState
- #enddef NewSetState
-
- #endclass CFSMachine
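- 
- # Illustrative only: a minimal sketch of how Eclosure is used (this
- # mirrors CompDFA further down; "nt" is a hypothetical nonterminal):
- #
- #   nt = kjParser.nonterminal("nt")
- #   nfa = CFSMachine(nt)
- #   # ... populate nfa via NewState/SetMap, using NULLTOKEN as the
- #   # epsilon token ...
- #   dfa = nfa.Eclosure(NULLTOKEN)
- #
- # The resulting dfa has one state for each distinct epsilon-closure
- # set of nfa states, with the defining set kept in dfa.States[s][1].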
-
-
- # Ruleset class, used to compute NFA and then DFA for
- # parsing based on a list of rules.
- #
- class ruleset:
-
- def __init__(self, StartNonterm, Rulelist):
- # initialize the ruleset
- self.StartNonterm = StartNonterm
- self.Rules = Rulelist
-
- # method to compute First sets (possible leading tokens) for nonterminals
- def CompFirst(self):
- # uses the special null production token NULLTOKEN
- # snarfed directly from Aho+Ullman (terminals glossed)
- First = kjSet.NewDG( [] )
- # repeat the while loop until no change is made to First
- done = 0
- while not done:
- done = 1 # assume we're done until a change is made to First
-
- # iterate through all rules looking for a new arc to add
- # indicating Nonterminal > possible first token derivation
- #
- for R in self.Rules:
- GoalNonterm = R.Nonterm
- Bodylength = len(R.Body)
- # look through the body of the rule up to the first token
- # with no known epsilon production
- Bodyindex = 0
- Processindex = 1
- while Processindex:
- # unless otherwise indicated below, don't go to next token
- Processindex = 0
-
- # if index is past end of body then record
- # an epsilon production for this nonterminal
- if Bodyindex >= Bodylength:
- if not kjSet.HasArc(First, GoalNonterm, NULLTOKEN ):
- kjSet.AddArc( First, GoalNonterm, NULLTOKEN )
- done = 0 # change made to First
- else:
- # otherwise try to add firsts of this token
- # to firsts of the Head of the rule.
- Token = R.Body[Bodyindex]
- (type, name) = Token
- if type in (KEYFLAG,TERMFLAG):
- # try to add this terminal to First for GoalNonterm
- if not kjSet.HasArc(First, GoalNonterm, Token):
- kjSet.AddArc( First, GoalNonterm, Token)
- done = 0
- elif type == NONTERMFLAG:
- # try to add each First entry for nonterminal
- # to First entry for GoalNonterm
- for FToken in kjSet.Neighbors( First, Token ):
- if not kjSet.HasArc(First, GoalNonterm, FToken):
- kjSet.AddArc( First, GoalNonterm, FToken)
- done = 0
- # does this nonterminal have a known e production?
- if kjSet.HasArc( First, Token, NULLTOKEN ):
- # if so, process next token in rule
- Processindex = 1
- else:
- raise TokenError, "unknown token type in rule body"
- #endif
- Bodyindex = Bodyindex + 1
- #endwhile Processindex
- #endfor R in self.Rules
- #endwhile not done
- self.First = First
- #enddef CompFirst
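- 
- # A sketch of CompFirst in isolation, using the textbook grammar from
- # the regression tests at the bottom of this file (rl0, E, Ep, ...):
- #
- #   rs = ruleset(E, rl0)
- #   rs.CompFirst()
- #   pairs = kjSet.GetPairs(rs.First)
- #
- # pairs should then contain entries such as (E, oppar), (E, id),
- # (Ep, plus) and (Ep, NULLTOKEN), the last recording that Ep
- # derives epsilon.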
-
- # computing the Follow set for the ruleset
- # the good news: I think it's correct.
- # the bad news: It's slower than it needs to be for epsilon cases.
- def CompFollow(self):
- Follow = kjSet.NewDG( [] )
-
- # put end marker on follow of start nonterminal
- kjSet.AddArc(Follow, self.StartNonterm, kjParser.ENDOFFILETOKEN)
-
- # now compute other follows using the rules;
- # repeat the loop until no change to Follow.
- done = 0
- while not done:
- done = 1 # assume done unless Follow changes
- for R in self.Rules:
- #print R
- # work backwards in the rule body to
- # avoid retesting for epsilon nonterminals
- Bodylength = len(R.Body)
- EpsilonTail = 1 # the tail of rule may expand to null
- BodyIndex = Bodylength - 1
- Last = 1 # loop starts at the last
- #from types import TupleType # only needed by the commented-out check below
- while BodyIndex >= 0:
- Token = R.Body[BodyIndex]
- (Ttype,Tname) = Token
- if Ttype in (KEYFLAG,TERMFLAG):
- # keywords etc cancel epsilon tail, otherwise ignore
- EpsilonTail = 0
- elif Ttype == NONTERMFLAG:
- # if the tail expands to epsilon, map
- # follow for the goal nonterminal to this token
- # and also follow for the tail nonterms
- if EpsilonTail:
- # add follow for goal
- for FToken in kjSet.Neighbors(Follow,R.Nonterm):
- if not kjSet.HasArc(Follow,Token,FToken):
- kjSet.AddArc(Follow,Token,FToken)
- #if type(FToken[0])==TupleType:
- # raise ValueError, "bad FToken"+`FToken`
- #print "new", Token, FToken
- done = 0 # follow changed, loop again
- # add follow for tail members
- #for Index2 in range(BodyIndex+1, Bodylength):
- # TailToken = R.Body[Index2]
- # for FToken in kjSet.Neighbors(Follow,TailToken):
- # if not kjSet.HasArc(Follow,Token,FToken):
- # kjSet.AddArc(Follow,Token,FToken)
- # done = 0
- #endif EpsilonTail
-
- # if we are not at the end use First set for next token
- if not Last:
- NextToken = R.Body[BodyIndex+1]
- (NTtype, NTname) = NextToken
- if NTtype in (KEYFLAG,TERMFLAG):
- if not kjSet.HasArc(Follow,Token,NextToken):
- kjSet.AddArc(Follow,Token,NextToken)
- #print "next", Token, NextToken
- done = 0
- elif NTtype == NONTERMFLAG:
- for FToken in kjSet.Neighbors(self.First, NextToken):
- if FToken != NULLTOKEN:
- if not kjSet.HasArc(Follow,Token,FToken):
- kjSet.AddArc(Follow,Token,FToken)
- #print "neighbor", Token, FToken
- done = 0
- else:
- # next token expands to epsilon:
- # add its follow, unless already done above
- #if not EpsilonTail:
- for FToken in kjSet.Neighbors(Follow,NextToken):
- if not kjSet.HasArc(Follow,Token,FToken):
- kjSet.AddArc(Follow,Token,FToken)
- #print "epsilon", Token, FToken
- done = 0
- else:
- raise TokenError, "unknown token type in rule body"
- #endif not Last
-
- # finally, check whether next iteration has epsilon tail
- if not kjSet.HasArc(self.First, Token, NULLTOKEN):
- EpsilonTail = 0
- else:
- raise TokenError, "unknown token type in rule body"
-
- BodyIndex = BodyIndex - 1
- Last = 0 # no longer at the last token of the rule
- #endwhile
- #endfor
- #endwhile
- self.Follow = Follow
- #enddef CompFollow
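- 
- # Continuing the sketch above: after rs.CompFollow() on the same
- # grammar, Follow(E) should contain ENDOFFILETOKEN and clpar, and
- # Follow(Ep) should equal Follow(E), since Ep ends the body of the
- # rule E >> T Ep.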
-
- def DumpFirstFollow(self):
- First = self.First
- Follow = self.Follow
- print "First:"
- for key in First.keys():
- name = key[1]
- print name," :: ",
- for (flag2,name2) in First[key].keys():
- print name2,", ",
- print
- print "Follow:"
- for key in Follow.keys():
- name = key[1]
- print name," :: ",
- for (flag2,name2) in Follow[key].keys():
- print name2,", ",
- print
-
- # computing the "first" of the tail of a rule followed by an
- # optional terminal
- # doesn't include NULLTOKEN
- # requires self.First to be computed
- #
- def FirstOfTail(self, Rule, TailIndex, Token=None):
- Result = kjSet.NewSet( [] )
- # go through all tokens in rule tail so long as there is a
- # null derivation for the remainder
- Nullprefix = 1
- BodyLength = len(Rule.Body)
- ThisIndex = TailIndex
- while Nullprefix and ThisIndex < BodyLength:
- RToken = Rule.Body[ThisIndex]
- (RTtype, RTname) = RToken
- if RTtype == NONTERMFLAG:
- for FToken in kjSet.Neighbors(self.First, RToken):
- if FToken != NULLTOKEN:
- kjSet.addMember(FToken, Result)
- #endfor
- # check whether this symbol might have a null production
- if not kjSet.HasArc(self.First, RToken, NULLTOKEN):
- Nullprefix = 0
- elif RTtype in [KEYFLAG, TERMFLAG]:
- kjSet.addMember(RToken, Result)
- Nullprefix = 0
- else:
- raise TokenError, "unknown token type in rule body"
- ThisIndex = ThisIndex + 1
- #endwhile
- # add the optional token if given and Nullprefix still set
- if Nullprefix and Token != None:
- kjSet.addMember(Token, Result)
- return Result
- #enddef FirstOfTail
-
- # compute an SLR NFA for the ruleset with states for each SLR "item"
- # and transitions, eg:
- # X > .AB
- # on A maps to X > A.B
- # on epsilon maps to A > .ZC
- # and A > .WK
- # an item is a pair (rulenumber, bodyposition)
- # where body position 0 is interpreted to point before the
- # beginning of the body.
- #
- # SLR = "simple LR" in Aho+Ullman terminology
- #
- def CompSLRNFA(self):
- NFA = CFSMachine(self.StartNonterm)
- Nrules = len(self.Rules)
- itemStateMap = {}
- for Ruleindex in range(0,Nrules):
- Rule = self.Rules[Ruleindex]
- # make an item for each "dot" position in the body
- for DotPos in range(0, len(Rule.Body) + 1):
- item = (Ruleindex, DotPos)
- itemState = NFA.NewState(TRANSFLAG, [item])
- itemStateMap[item] = itemState
- #endfor DotPos
- #endfor Ruleindex
-
- # now that the states are initialized
- # compute transitions except for the last item of a rule
- # (which has none)
- for Ruleindex in range(0,Nrules):
- Rule = self.Rules[Ruleindex]
- for DotPos in range(0, len(Rule.Body)):
- item = (Ruleindex, DotPos)
- CurrentToken = Rule.Body[DotPos]
- ThisState = itemStateMap[item]
- NextState = itemStateMap[ (Ruleindex, DotPos + 1) ]
- NFA.SetMap( ThisState, CurrentToken, NextState )
- # if the current token is a nonterminal
- # add epsilon transitions to the first item for any
- # rule that derives this nonterminal
- (CTtype, CTname) = CurrentToken
- if CTtype == NONTERMFLAG:
- for Rule2index in range(0,Nrules):
- Rule2 = self.Rules[Rule2index]
- Head = Rule2.Nonterm
- if Head == CurrentToken:
- NextState = itemStateMap[( Rule2index, 0 )]
- NFA.SetMap( ThisState, NULLTOKEN, NextState )
- #endfor Rule2index
- #endif CTtype == NONTERMFLAG
- #endfor DotPos
- #endfor Ruleindex
-
- # must handle the initial state properly here!
- # Make a dummy state with e-transitions to all first items
- # for rules that derive the initial nonterminal
- ThisState = NFA.initial_state
- GoodStartingPlace = None
- for Ruleindex in range(0,Nrules):
- Rule = self.Rules[Ruleindex]
- Head = Rule.Nonterm
- if Head == self.StartNonterm:
- GoodStartingPlace= (Ruleindex, 0)
- NextState = itemStateMap[ GoodStartingPlace ]
- NFA.SetMap( ThisState, NULLTOKEN, NextState )
- # fix the NFA.States entry
- if GoodStartingPlace == None:
- raise NotSLRError, "No derivation for root nonterminal."
- NFA.States[ NFA.initial_state ] = \
- [ TRANSFLAG, GoodStartingPlace ]
-
- self.SLRNFA = NFA
- #enddef CompSLRNFA
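- 
- # Note on the item encoding: an item (ruleindex, dotposition) such as
- # (0, 1) for a rule E >> T Ep stands for E >> T (*) Ep, which is the
- # form ItemDump below prints.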
-
- # dump an item
- def ItemDump(self, item):
- (ruleindex, position) = item
- Rule = self.Rules[ruleindex]
- print Rule.Nonterm[1],' >> ',
- for bindex in range(0, len(Rule.Body)):
- if position == bindex:
- print " (*) ",
- print Rule.Body[bindex][1],
- if position == len(Rule.Body):
- print " (*) "
- else:
- print
-
- # utility function -- returns true if an item is a final item
- def SLRItemIsFinal(self, item):
- (ruleindex, position) = item
- Rule = self.Rules[ruleindex]
- if position == len(Rule.Body):
- return 1
- else:
- return 0
-
- # dump the NFA
- def DumpSLRNFA(self):
- NFA = self.SLRNFA
- print "root: ", NFA.root_nonTerminal
- for key in NFA.StateTokenMap.keys():
- map = NFA.StateTokenMap[key]
- (fromstate, token) = key
- fromitem = NFA.States[ fromstate ][1]
- self.ItemDump(fromitem)
- print " on ", token[1], " maps "
- for Tostate in map:
- Toitem = NFA.States[Tostate][1]
- print " ",
- self.ItemDump(Toitem)
-
- # compute DFA for ruleset by computing the E-closure of the
- # NFA
- def CompDFA(self):
- self.DFA = self.SLRNFA.Eclosure(NULLTOKEN)
-
- def DumpDFAsets(self):
- DFA = self.DFA
- print "root: ", DFA.root_nonTerminal
- for State in range(1, len(DFA.States) ):
- self.DumpItemSet(State)
-
- def DumpItemSet(self,State):
- DFA = self.DFA
- NFA = self.SLRNFA
- print
- print "STATE ", State, " *******"
- fromNFAindices = kjSet.get_elts(DFA.States[State][1])
- for NFAindex in fromNFAindices:
- item = NFA.States[NFAindex][1]
- print " ", NFAindex, ": ",
- self.ItemDump(item)
-
- # this function completes the computation of an SLR DFA
- # by adding reduction actions for each DFA state S containing
- # a completed item H > B. (dot at the end),
- # which reduces rule H > B,
- # for each token T in the Follow set of H.
- # if S already has a transition for T then there is a conflict!
- # if S already has a transition for T then there is a conflict!
- #
- # assumes DFA and SLRNFA and Follow have been computed.
- #
- def SLRFixDFA(self):
- DFA = self.DFA
- NFA = self.SLRNFA
- # look through the states (except 0=success) of the DFA
- # initially don't add any new states, just record
- # actions to be done
- # uses convention that 0 is successful final state
-
- # ToDo is a dictionary which maps
- # (State, Token) to an item to reduce
- ToDo = {}
- Error = None
- for State in range(1, len(DFA.States) ):
- # look for a final item for a rule in this state
- fromNFAindices = kjSet.get_elts(DFA.States[State][1])
- for NFAindex in fromNFAindices:
- item = NFA.States[NFAindex][1]
- # if the item is final remember to do the reductions...
- if self.SLRItemIsFinal(item):
- (ruleindex, position) = item
- Rule = self.Rules[ruleindex]
- Head = Rule.Nonterm
- Following = kjSet.Neighbors( self.Follow, Head )
- for Token in Following:
- key = (State, Token)
- if not ToDo.has_key(key):
- ToDo[ key ] = item
- else:
- # it might be okay if the items are identical?
- item2 = ToDo[key]
- if item != item2:
- print "reduce/reduce conflict on ",key
- self.ItemDump(item)
- self.ItemDump(item2)
- Error = " apparent reduce/reduce conflict"
- #endif
- #endfor
- #endif
- #endfor NFAindex
- #endfor State
-
- # for each (State,Token) pair which indicates a reduction
- # record the reduction UNLESS the map is already set for the pair
- for key in ToDo.keys():
- (State,Token) = key
- item = ToDo[key]
- (rulenum, dotpos) = item
- ExistingMap = DFA.map( State, Token )
- if ExistingMap[0] == NOMATCHFLAG:
- DFA.SetReduction( State, Token, rulenum )
- else:
- print "apparent shift/reduce conflict"
- print "reduction: ", key, ": "
- self.ItemDump(item)
- print "existing map ", ExistingMap
- Error = " apparent shift/reduce conflict"
- #endfor
- if Error and ABORTONERROR:
- raise NotSLRError, Error
- #enddef SLRFixDFA()
-
- # do complete SLR DFA creation starting after initialization
- def DoSLRGeneration(self):
- self.CompFirst()
- self.CompFollow()
- self.CompSLRNFA()
- self.CompDFA()
- self.SLRFixDFA()
-
- #endclass ruleset
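- 
- # A minimal end-to-end sketch of ruleset usage (a condensed version of
- # the sequence the regression tests below perform):
- #
- #   rs = ruleset(StartNonterm, ListOfParseRules)
- #   rs.DoSLRGeneration()   # First, Follow, NFA, DFA, reductions
- #   dfa = rs.DFA           # usable with kjParser.Grammar/ParserObj
- #
- # SLRFixDFA prints any apparent shift/reduce or reduce/reduce
- # conflicts and, if ABORTONERROR is set, raises NotSLRError.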
-
-
- ################ the following are interpretation functions
- ################ used by RULEGRAM meta grammar
- # some constants used here
- COMMENTFORM = "##.*\n"
- RSKEY = "@R"
- COLKEY = "::"
- LTKEY = ">>"
- IDNAME = "ident"
- # an identifier in the meta grammar is any nonwhite string
- # except the keywords @R :: >> or comment flag ##
- IDFORM = "[^" + string.whitespace + "]+"
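- 
- # For reference, a rule declaration in this meta grammar looks like
- #
- #   ## a comment, running to end of line
- #   @R someRuleName :: SomeNonterm >> token token ...
- #
- # where someRuleName is any identifier, SomeNonterm must be registered
- # in the object grammar's NonTermDict, and each body token is resolved
- # as a nonterminal, keyword, or terminal of the object grammar (see
- # InterpNonTerm and FullBody below).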
-
- # for identifiers simply return the string
- def IdentFun(string):
- return string
-
- # RootReduction should receive list of form
- # [ nontermtoken, keyword COLKEY, RuleList ]
- def RootReduction(list, ObjectGram):
- if len(list) != 3 or list[1] != COLKEY:
- raise FlowError, "unexpected metagrammar root reduction"
- return (list[0], list[2])
-
- # NullRuleList should receive list of form
- # []
- def NullRuleList(list, ObjectGram):
- if list != []:
- raise FlowError, "unexpected null RuleList form"
- return []
-
- # FullRuleList should receive list of form
- # [ Rule, RuleList ]
- def FullRuleList(list, ObjectGram):
- if type(list) != type([]) or len(list)!=2:
- raise FlowError, "unexpected full RuleList form"
- NewRule = list[0]
- OldRules = list[1]
- return [NewRule] + OldRules
-
- # InterpRule should receive list of form
- # [keyword RSKEY,
- # RuleNameStr,
- # keyword COLKEY,
- # Nontermtoken,
- # keyword LTKEY,
- # Bodylist]
- #
- def InterpRule(list, ObjectGram):
- # check keywords:
- if len(list) != 6 or \
- list[0] != RSKEY or \
- list[2] != COLKEY or \
- list[4] != LTKEY:
- raise FlowError, "unexpected meta rule reduction form"
- ruleName = list[1]
- ruleNonterm = list[3]
- ruleBody = list[5]
- # upcase the representation of keywords if needed
- if not ObjectGram.LexD.isCaseSensitive():
- for i in range(0,len(ruleBody)):
- (flag, name) = ruleBody[i]
- if flag == KEYFLAG:
- ruleBody[i] = (KEYFLAG, string.upper(name))
- elif not flag in (TERMFLAG, NONTERMFLAG):
- raise FlowError, "unexpected rule body member"
- rule = kjParser.ParseRule( ruleNonterm, ruleBody )
- rule.Name = ruleName
- return rule
-
- # InterpRuleName should receive
- # [ string ]
- def InterpRuleName(list, ObjectGram):
- #print list
- # add error checking?
- return list[0]
-
- # InterpNonTerm should receive
- # [ string ]
- def InterpNonTerm(list, ObjectGram):
- #print list
- if type(list)!=type([]) or len(list)!=1:
- raise FlowError, "unexpected rulename form"
- Name = list[0]
- # determine whether this is a valid nonterminal
- if not ObjectGram.NonTermDict.has_key(Name):
- #print Name
- raise TokenError, "LHS of Rule must be nonterminal: "+Name
- return ObjectGram.NonTermDict[Name]
-
- # NullBody should receive []
- def NullBody(list, ObjectGram):
- #print list
- if list != []:
- raise FlowError, "unexpected null Body form"
- return []
-
- # FullBody should receive
- # [ string, Bodylist]
- # must determine whether the string represents
- # a keyword, a nonterminal, or a terminal of the object
- # grammar.
- # returns (KEYFLAG, string) (TERMFLAG, string) or
- # (NONTERMFLAG, string) respectively
- #
- def FullBody(list,ObjectGram):
- #print list
- if type(list)!=type([]) or len(list)!=2:
- raise FlowError, "unexpected body form"
- Name = list[0]
- # Does the Name represent a nonterm, keyword, or term
- # of the object grammar (tested in that order)?
- if ObjectGram.NonTermDict.has_key(Name):
- kind = NONTERMFLAG
- elif ObjectGram.LexD.keywordmap.has_key(Name):
- kind = KEYFLAG
- elif ObjectGram.TermDict.has_key(Name):
- kind = TERMFLAG
- else:
- raise TokenError, "Rule body contains unregistered string: "+Name
- restOfBody = list[1]
- return [(kind, Name)] + restOfBody
-
- # function to generate a grammar for parsing grammar rules
- #
- def ruleGrammar():
- LexD = kjParser.LexDictionary()
- # comments run from ## to end of line (see COMMENTFORM)
- LexD.comment( COMMENTFORM )
- # declare keywords
- RStart = LexD.keyword( RSKEY )
- TwoColons = LexD.keyword( COLKEY )
- LeadsTo = LexD.keyword( LTKEY )
- # declare terminals
- ident = LexD.terminal(IDNAME, IDFORM, IdentFun )
- # declare nonterminals
- Root = kjParser.nonterminal("Root")
- Rulelist = kjParser.nonterminal("RuleList")
- Rule = kjParser.nonterminal("Rule")
- RuleName = kjParser.nonterminal("RuleName")
- NonTerm = kjParser.nonterminal("NonTerm")
- Body = kjParser.nonterminal("Body")
-
- # declare rules
- # Root >> NonTerm :: Rulelist
- InitRule = kjParser.ParseRule( Root, \
- [NonTerm, TwoColons, Rulelist], RootReduction )
- # Rulelist >>
- RLNull = kjParser.ParseRule( Rulelist, [], NullRuleList)
- # Rulelist >> Rule Rulelist
- RLFull = kjParser.ParseRule( Rulelist, [Rule,Rulelist], FullRuleList)
- # Rule >> "@R :: NonTerm >> Body
- RuleR = kjParser.ParseRule( Rule, \
- [RStart, RuleName, TwoColons, NonTerm, LeadsTo, Body],\
- InterpRule)
- # RuleName >> ident
- RuleNameR = kjParser.ParseRule( RuleName, [ident], InterpRuleName)
- # NonTerm >> ident
- NonTermR = kjParser.ParseRule( NonTerm, [ident], InterpNonTerm)
- # Body >>
- BodyNull = kjParser.ParseRule( Body, [], NullBody)
- # Body >> ident Body
- BodyFull = kjParser.ParseRule( Body, [ident,Body], FullBody)
-
- # declare Rules list and Associated Name dictionary
- Rules = [RLNull, RLFull, RuleR, RuleNameR, NonTermR,\
- BodyNull, BodyFull, InitRule]
- RuleDict = \
- { "RLNull":0, "RLFull":1, "RuleR":2, "RuleNameR":3, \
- "NonTermR":4, "BodyNull":5, "BodyFull":6 , "InitRule":7 }
- # make the RuleSet and compute the associated DFA
- RuleSet = ruleset( Root, Rules )
- RuleSet.DoSLRGeneration()
- # construct the Grammar object
- Result = kjParser.Grammar( LexD, RuleSet.DFA, Rules, RuleDict )
- return Result
-
- #enddef ruleGrammar()
-
-
- # this is the rule grammar object used for
- # parsing rule declarations
- RULEGRAM = ruleGrammar()
-
- # a derived grammar class (object oriented programming is cool!)
- # this is a compilable grammar for automatic parser generation.
- #
- class CGrammar(kjParser.Grammar):
-
- # initialization is handled by the base class
-
- # insert a whitespace-separated list of keywords into the LexD
- # THIS SHOULD CHECK FOR KEYWORD/NONTERMINAL/PUNCT NAME
- # COLLISIONS (BUT DOESN'T YET).
- def Keywords(self, Stringofkeys):
- keywordlist = string.split(Stringofkeys)
- for keyword in keywordlist:
- self.LexD.keyword( keyword )
-
- # insert a string of punctuations into the LexD
- def punct(self, Stringofpuncts):
- for p in Stringofpuncts:
- self.LexD.punctuation(p)
-
- # register a list of regular expression strings
- # to represent comments in LexD
- def comments(self, listOfCommentStrings):
- for str in listOfCommentStrings:
- self.LexD.comment(str)
-
- # register a whitespace-separated list of nonterminal strings
- def Nonterms(self, StringofNonterms):
- nonTermlist = string.split(StringofNonterms)
- for NonTerm in nonTermlist:
- self.NonTermDict[NonTerm] = kjParser.nonterminal(NonTerm)
-
- # initialize or add more rules to the RuleString
- def Declarerules(self, StringWithRules):
- self.RuleString = self.RuleString + "\n" + StringWithRules
-
- # The compilation function assumes
- # NonTermDict
- # RuleString
- # LexD
- # TermDict
- # have all been set up properly
- # (at least if the default MetaGrammar is used).
- # On successful completion it will set up
- # DFA
- # RuleL
- # RuleNameToIndex
- def Compile(self, MetaGrammar=RULEGRAM):
- # the following should return a list of rules
- # with punctuations of self.LexD interpreted as trivial keywords
- # keywords of self.LexD interpreted as keywords
- # and nonterminals registered in NonTermDict interpreted as
- # nonterms.
- # ParseResult should be of form (rootNT, RuleL)
- ParseResult = MetaGrammar.DoParse1( self.RuleString, self )
- (RootNonterm, Rulelist) = ParseResult
-
- # make a ruleset and compute its DFA
- RuleS = ruleset( RootNonterm, Rulelist )
- RuleS.DoSLRGeneration()
-
- # make the rulename to index map to allow future bindings
- for i in range(0,len(Rulelist)):
- Rule = Rulelist[i]
- self.RuleNameToIndex[ Rule.Name ] = i
-
- # fill in the blanks
- self.DFA = RuleS.DFA
- self.RuleL = Rulelist
-
- # FOR DEBUG AND TESTING
- self.Ruleset = RuleS
-
- # DON'T clean up the grammar (misc structures are used)
- # in future bindings
- #enddef Compile
-
-
- # Write a reconstructable representation for this grammar
- # to a file
- #EXCEPT:
- # - rule associations to reduction functions
- # will be lost (must be reset elsewhere)
- # - terminals in the lexical dictionary
- # will not be initialized
- #
- # indent is used for indentation, should be whitespace (add check!)
- #
- # FName if given will cause the reconstruction to be placed
- # inside a function `FName`+"()" returning the grammar object
- #
- # NOTE: this function violates information hiding principles;
- # in particular it "knows" the guts of the FSM and LexD classes
- #
- def Reconstruct(self, VarName, Tofile, FName=None, indent=""):
- Reconstruction = codeReconstruct(VarName, Tofile, self, FName, indent)
- GrammarDumpSequence(Reconstruction)
- #enddef Reconstruct
-
- # marshalling of a grammar to a file
- def MarshalDump(self, Tofile):
- Reconstruction = marshalReconstruct(self, Tofile)
- GrammarDumpSequence(Reconstruction)
-
- #endclass CGrammar
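- 
- # A sketch of the typical CGrammar workflow (compare the RULEGRAM test
- # at the bottom of this file; the regular expression, rule text, and
- # MyReduction below are illustrative placeholders, and the sketch
- # assumes Grammar.__init__ starts RuleString as an empty string):
- #
- #   G = NullCGrammar()
- #   G.Addterm("id", "[a-z]+", IdentFun) # terminal: regex + interp fn
- #   G.Nonterms("E T")
- #   G.Keywords("begin end")
- #   G.punct("+*()")
- #   G.comments(["--.*\n"])
- #   G.Declarerules("E ::\n @R r1 :: E >> begin T end\n")
- #   G.Compile()
- #   G.Bind("r1", MyReduction)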
-
- # general procedure for different types of archiving for grammars
- def GrammarDumpSequence(ReconstructObj):
- # assume an initialized Reconstruct Object with appropriate grammar etc.
- # put the lexical part
- ReconstructObj.PutLex()
- # put the rules
- ReconstructObj.PutRules()
- # put transitions
- ReconstructObj.PutTransitions()
- # finish up
- ReconstructObj.Cleanup()
-
- # function to create a "null CGrammar"
- def NullCGrammar():
- return CGrammar(None,None,None,{})
-
- # utility classes -- Grammar reconstruction objects
- # encapsulate the process of grammar archiving.
- #
- class Reconstruct:
-
- # this "virtual class" is only for common behaviors of subclasses.
- def MakeTokenArchives(self):
- # make a list of all tokens and
- # initialize token > int dictionary
- keys = self.Gram.DFA.StateTokenMap.keys()
- tokenToInt = {}
- tokenSet = kjSet.NewSet([])
- for k in keys:
- kjSet.addMember(k[1], tokenSet)
- tokens = kjSet.get_elts(tokenSet)
- for i in range(0,len(tokens)):
- tokenToInt[ tokens[i] ] = i
-
- self.keys = keys
- self.tokens = tokens # global sub
- self.tokInt = tokenToInt # global sub
-
- # grammar reconstruction to a file
- class codeReconstruct(Reconstruct):
-
- def __init__(self, VarName, Tofile, Grammar, FName=None, indent =""):
- # do global subs for each of these
- self.Var = VarName
- self.File = Tofile
- self.FName = FName
- self.Gram = Grammar
-
- # put the reconstruction in a function if FName is given
- if FName != None:
- Tofile.write("\n\n")
- Tofile.write(indent+"def "+FName+"():\n")
- IND = indent+" "
- else:
- IND = indent
- self.I = IND # global sub!
- Tofile.write("\n\n")
- Tofile.write(IND+"# ******************************BEGIN RECONSTRUCTION\n")
- Tofile.write(IND+"# Python declaration of Grammar variable "+VarName+".\n")
- Tofile.write(IND+"# automatically generated by module "+PMODULE+".\n")
- Tofile.write(IND+"# Altering this sequence by hand will probably\n")
- Tofile.write(IND+"# leave it unusable.\n")
- Tofile.write(IND+"#\n")
- Tofile.write(IND+"import "+PMODULE+"\n\n")
- Tofile.write(IND+"# variable declaration:\n")
- Tofile.write(IND+VarName+"= "+PMODULE+".NullGrammar()\n\n")
-
- # make self.keys list of dfa keys,
- # self.tokens list of grammar tokens,
- # self.tokInt inverted dictionary for self.tokens
- self.MakeTokenArchives()
-
- Tofile.write("\n\n"+IND+"# case sensitivity behavior for keywords.\n")
- if self.Gram.LexD.isCaseSensitive():
- Tofile.write(IND+VarName+".SetCaseSensitivity(1)\n")
- else:
- Tofile.write(IND+VarName+".SetCaseSensitivity(0)\n")
- #enddef __init__
-
- def PutLex(self):
- IND = self.I
- Tofile = self.File
- VarName = self.Var
- LexD = self.Gram.LexD
- tokens = self.tokens
-
- Tofile.write("\n\n"+IND+"# declaration of lexical dictionary.\n")
- Tofile.write(IND+"# EXCEPT FOR TERMINALS\n")
- Tofile.write(IND+VarName+".LexD.punctuationlist = ")
- Tofile.write(`LexD.punctuationlist`+"\n")
- Tofile.write(IND+"# now comment patterns\n")
- for comment in LexD.commentstrings:
- Tofile.write(IND+VarName+".LexD.comment("+`comment`+")\n")
- Tofile.write(IND+"# now define tokens\n")
- for i in range(0,len(tokens)):
- tok = tokens[i]
- (kind, name) = tok
- if kind == TERMFLAG:
- # put warning at end!
- # nonterminal not installed in lexical dictionary here!
- Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
- Tofile.write(PMODULE+".termrep("+`name`+")\n")
- elif kind == KEYFLAG:
- Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
- Tofile.write(VarName+".LexD.keyword("+`name`+")\n")
- elif kind == NONTERMFLAG:
- Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
- Tofile.write(PMODULE+".nonterminal("+`name`+")\n")
- else:
- raise FlowError, "unknown token type"
- #enddef PutLex
-
- def PutRules(self):
- IND = self.I
- VarName = self.Var
- Rules = self.Gram.RuleL
- Tofile = self.File
- Root = self.Gram.DFA.root_nonTerminal
- Tofile.write("\n\n"+IND+"# declaration of rule list with names.\n")
- Tofile.write(IND+"# EXCEPT FOR INTERP FUNCTIONS\n")
- nrules = len(Rules)
- Tofile.write(IND+VarName+".RuleL = [None] * "+`nrules`+"\n")
- for i in range(0,nrules):
- # put warning at end:
- # rule reduction function not initialized here!
- rule = Rules[i]
- name = rule.Name
- Tofile.write(IND+"rule = "+`rule`+"\n")
- Tofile.write(IND+"name = "+`name`+"\n")
- Tofile.write(IND+"rule.Name = name\n")
- Tofile.write(IND+VarName+".RuleL["+`i`+"] = rule\n")
- Tofile.write(IND+VarName+".RuleNameToIndex[name] = "+`i`+"\n")
-
- Tofile.write("\n\n"+IND+"# DFA root nonterminal.\n")
- Tofile.write(IND+VarName+".DFA.root_nonTerminal =")
- Tofile.write(`Root`+"\n")
- #enddef PutRules
-
- def PutTransitions(self):
- IND = self.I
- Tofile = self.File
- VarName = self.Var
- maxState = self.Gram.DFA.maxState
- tokenToInt = self.tokInt
- StateTokenMap = self.Gram.DFA.StateTokenMap
- keys = self.keys
-
- Tofile.write("\n\n"+IND+"# DFA state declarations.\n")
- for state in range(1, maxState+1):
- Tofile.write(IND+VarName+".DFA.States["+`state`+"] = ")
- Tofile.write('['+`TRANSFLAG`+']\n')
- Tofile.write(IND+VarName+".DFA.maxState = "+`maxState`+"\n")
-
- Tofile.write("\n\n"+IND+"# DFA transition declarations.\n")
- for key in keys:
- (fromState, TokenRep) = key
- TokenIndex = tokenToInt[TokenRep]
- TokenArg = VarName+".IndexToToken["+`TokenIndex`+"]"
- TMap = StateTokenMap[key]
- TMaptype = TMap[0][0]
- if TMaptype == REDUCEFLAG:
- # reduction
- rulenum = TMap[0][1]
- Args = "("+`fromState`+","+TokenArg+","+`rulenum`+")"
- Tofile.write(IND+VarName+".DFA.SetReduction"+Args+"\n")
- elif TMaptype == MOVETOFLAG:
- # MoveTo
- Args = "("+`fromState`+","+TokenArg+","+`TMap[0][1]`+")"
- Tofile.write(IND+VarName+".DFA.SetMap"+Args+"\n")
- else:
- raise FlowError, "unexpected else (2)"
- #enddef
-
- def Cleanup(self):
- Tofile = self.File
- RuleL = self.Gram.RuleL
- tokens = self.tokens
- VarName = self.Var
- IND = self.I
- FName = self.FName
-
- Tofile.write("\n\n"+IND+"# Clean up the grammar.\n")
- Tofile.write(IND+VarName+".CleanUp()\n")
-
- # if the Fname was given return the grammar as function result
- if FName != None:
- Tofile.write("\n\n"+IND+"# return the grammar.\n")
- Tofile.write(IND+"return "+VarName+"\n")
-
- Tofile.write("\n\n"+IND+"# WARNINGS ****************************** \n")
- Tofile.write(IND+"# You must bind the following rule names \n")
- Tofile.write(IND+"# to reduction interpretation functions \n")
- for R in RuleL:
- Tofile.write(IND+"# "+VarName+".Bind("+`R.Name`+", ??function??)\n")
- Tofile.write(IND+"#(last rule)\n")
-
- Tofile.write("\n\n"+IND+"# WARNINGS ****************************** \n")
- Tofile.write(IND+"# You must bind the following terminals \n")
- Tofile.write(IND+"# to regular expressions and interpretation functions \n")
- warningPrinted = 0
- for tok in tokens:
- (kind, name) = tok
- if kind == TERMFLAG and tok != ENDOFFILETOKEN:
- Tofile.write(IND+"# "+VarName+\
- ".Addterm("+`name`+", ??regularExp??, ??function??)\n")
- warningPrinted = 1
- if not warningPrinted:
- Tofile.write(IND+"# ***NONE** \n")
- Tofile.write(IND+"#(last terminal)\n")
- Tofile.write(IND+"# ******************************END RECONSTRUCTION\n")
- #enddef
- #endclass
-
- # reconstruction using marshalling to a file
- # encodes internal structures for grammar using marshal-able
- # objects. Final marshalling to the file is done at CleanUp(),
- # storing one big list.
- #
- class marshalReconstruct(Reconstruct):
-
- def __init__(self, Grammar, Tofile):
- self.Gram = Grammar
- self.File = Tofile
- # should archive self.tokens structure
- self.MakeTokenArchives()
- # archive this
- self.CaseSensitivity = Grammar.LexD.isCaseSensitive()
-
- def PutLex(self):
- LexD = self.Gram.LexD
- # archive these
- self.punct = LexD.punctuationlist
- self.comments = LexD.commentstrings
-
- def PutRules(self):
- # archive this
- self.Root = self.Gram.DFA.root_nonTerminal
- # make a list of tuples that can be used with
- # rule = apply(ParseRule, tuple[1])
- # rule.Name = tuple[0]
- Rules = self.Gram.RuleL
- nrules = len(Rules)
- RuleTuples = [None] * nrules
- for i in range(nrules):
- rule = Rules[i]
- RuleTuples[i] = (rule.Name, rule.components())
- #archive this
- self.RuleTups = RuleTuples
-
- def PutTransitions(self):
- keys = self.keys
- tokenToInt = self.tokInt
- StateTokenMap = self.Gram.DFA.StateTokenMap
-
- # archive this
- self.MaxStates = self.Gram.DFA.maxState
-
- # create two lists,
- # one for reductions with contents (fromState, tokennumber, rulenum)
- # one for movetos with contents (fromstate, tokennumber, tostate)
- # (note: token number not token itself to allow sharing)
- # to allow arbitrary growing, first use dicts:
- reductDict = {}
- nreducts = 0
- moveToDict = {}
- nmoveTos = 0
- for key in self.keys:
- (fromState, TokenRep) = key
- TokenIndex = tokenToInt[TokenRep]
- TMap = StateTokenMap[key]
- TMaptype = TMap[0][0]
- if TMaptype == REDUCEFLAG:
- rulenum = TMap[0][1]
- reductDict[nreducts] = (fromState, TokenIndex, rulenum)
- nreducts = nreducts + 1
- elif TMaptype == MOVETOFLAG:
- ToState = TMap[0][1]
- moveToDict[nmoveTos] = (fromState, TokenIndex, ToState)
- nmoveTos = nmoveTos + 1
- else:
- raise FlowError, "unexpected else"
- #endfor
- # translate dicts to lists
- reducts = [None] * nreducts
- for i in range(nreducts):
- reducts[i] = reductDict[i]
- moveTos = [None] * nmoveTos
- for i in range(nmoveTos):
- moveTos[i] = moveToDict[i]
-
- # archive these
- self.reducts = reducts
- self.moveTos = moveTos
-
- # this is the function that does the marshalling
- def Cleanup(self):
- import marshal
- # make the big list to marshal
- BigList = [None] * 9
- BigList[0] = self.tokens
- BigList[1] = self.punct
- BigList[2] = self.comments
- BigList[3] = self.RuleTups
- BigList[4] = self.MaxStates
- BigList[5] = self.reducts
- BigList[6] = self.moveTos
- BigList[7] = self.Root
- BigList[8] = self.CaseSensitivity
- # dump the big list to the file
- marshal.dump( BigList, self.File )
-
- #end class
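- 
- # A sketch of the marshal round trip. MarshalDump writes the 9-element
- # list above via the marshal module; reading it back is assumed to be
- # handled by a matching loader in kjParser (verify the name before
- # relying on it):
- #
- #   outfile = open("gram.mar", "wb")
- #   G.MarshalDump(outfile)
- #   outfile.close()
- #   G2 = kjParser.UnMarshalGram(open("gram.mar", "rb"))
- #
- # As with code reconstruction, terminal regexps and rule reduction
- # bindings are not archived and must be rebound afterwards.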
-
- #######################testing stuff
- if RUNTESTS:
- def echo(x): return x
-
- # simple grammar stolen from a text
- LD0 = kjParser.LexDictionary()
- id = LD0.terminal("id","id",echo)
- plus = LD0.punctuation("+")
- star = LD0.punctuation("*")
- oppar = LD0.punctuation("(")
- clpar = LD0.punctuation(")")
- equals = LD0.punctuation("=")
- E = kjParser.nonterminal("E")
- T = kjParser.nonterminal("T")
- Tp = kjParser.nonterminal("Tp")
- Ep = kjParser.nonterminal("Ep")
- F = kjParser.nonterminal("F")
- rule1 = kjParser.ParseRule( E, [ T, Ep ] )
- rule2 = kjParser.ParseRule( Ep, [ plus, T, Ep ] )
- rule3 = kjParser.ParseRule( Ep, [ ] )
- rule4 = kjParser.ParseRule( T, [ F, Tp ] )
- rule5 = kjParser.ParseRule( Tp, [ star, F, Tp ] )
- rule6 = kjParser.ParseRule( Tp, [ ] )
- rule7 = kjParser.ParseRule( F, [ oppar, E, clpar ] )
- rule8 = kjParser.ParseRule( F, [ id ] )
-
- rl0 = [ rule1, rule2, rule3, rule4, rule5, rule6, rule7,rule8]
- rs0 = ruleset(E, rl0)
- rs0.CompFirst()
- Firstpairs = kjSet.GetPairs(rs0.First)
- rs0.CompFollow()
- Followpairs = kjSet.GetPairs(rs0.Follow)
- rs0.CompSLRNFA()
- NFA0 = rs0.SLRNFA
- rs0.CompDFA()
- rs0.SLRFixDFA()
- DFA0 = rs0.DFA
-
- class dummy: pass
- ttt0 = dummy()
-
- def TESTDFA( STRING , ttt, DFA, Rulelist, DOREDUCTIONS = 1):
- ttt.STRING = STRING
- #ttt.List = kjParser.LexList(LD0, ttt.STRING)
- ttt.Stream = kjParser.LexStringWalker( ttt.STRING, LD0 )
- ttt.Stack = {-1:0}# Walkers.SimpleStack()
- ttt.ParseObj = kjParser.ParserObj( Rulelist, \
- ttt.Stream, DFA, ttt.Stack,DOREDUCTIONS)
- ttt.RESULT = ttt.ParseObj.GO()
- #ttt.Stack.Dump(10)
- return ttt.RESULT
-
- def TESTDFA0( STRING , DOREDUCTIONS = 1):
- return TESTDFA( STRING, ttt0, DFA0, rl0, DOREDUCTIONS )
-
- TESTDFA0( " id + id * id ")
-
- # an even simpler grammar
- S = kjParser.nonterminal("S")
- M = kjParser.nonterminal("M")
- A = kjParser.nonterminal("A")
- rr1 = kjParser.ParseRule( S, [M] )
- #rr2 = kjParser.ParseRule( A, [A, plus, M])
- #rr3 = kjParser.ParseRule( A, [M], echo)
- #rr4 = kjParser.ParseRule( M, [M, star, M])
- rr5 = kjParser.ParseRule( M, [oppar, M, clpar])
- rr6 = kjParser.ParseRule( M, [id])
- rl1 = [rr1,rr5,rr6]
- rs1 = ruleset(S, rl1)
- rs1.CompFirst()
- rs1.CompFollow()
- rs1.CompSLRNFA()
- rs1.CompDFA()
- rs1.SLRFixDFA()
- DFA1 = rs1.DFA
-
- ttt1=dummy()
-
- def TESTDFA1( STRING , DOREDUCTIONS = 1):
- return TESTDFA( STRING, ttt1, DFA1, rl1, DOREDUCTIONS )
-
- X = kjParser.nonterminal("X")
- Y = kjParser.nonterminal("Y")
- RX = kjParser.ParseRule( X, [ oppar, Y, clpar ] )
- RY = kjParser.ParseRule( Y, [] )
- rl2 = [RX,RY]
- rs2 = ruleset(X, rl2)
- rs2.CompFirst()
- rs2.CompFollow()
- rs2.CompSLRNFA()
- rs2.CompDFA()
- rs2.SLRFixDFA()
- DFA2 = rs2.DFA
-
- ttt2 = dummy()
- def TESTDFA2( STRING, DOREDUCTIONS = 1):
- return TESTDFA( STRING, ttt2, DFA2, rl2, DOREDUCTIONS )
-
- # the following grammar should fail to be slr
- # (Aho,Ullman p. 213)
-
- S = kjParser.nonterminal("S")
- L = kjParser.nonterminal("L")
- R = kjParser.nonterminal("R")
- RS1 = kjParser.ParseRule( S, [L, equals, R] )
- RS2 = kjParser.ParseRule( S, [R], echo )
- RL1 = kjParser.ParseRule( L, [star, R])
- RL2 = kjParser.ParseRule( L, [id])
- RR1 = kjParser.ParseRule( R, [L] )
- rs3 = ruleset(S, [RS1,RS2,RL1,RL2,RR1])
- rs3.CompFirst()
- rs3.CompFollow()
- rs3.CompSLRNFA()
- rs3.CompDFA()
- #rs3.SLRFixDFA() # should fail and does.
-
- # testing RULEGRAM
- ObjG = NullCGrammar()
- ObjG.Addterm("id","id",echo)
- ObjG.Nonterms("T E Ep F Tp")
- ObjG.Keywords("begin end")
- ObjG.punct("+*()")
- ObjG.comments(["--.*\n"])
- # PROBLEM WITH COMMENTS???
- Rulestr = """
- ## what a silly grammar!
- T ::
- @R One :: T >> begin E end
- @R Three :: E >>
- @R Two :: E >> E + T
- @R Four :: E >> ( T )
- """
- RL = RULEGRAM.DoParse1( Rulestr, ObjG )
-